This code follows publications_LENS_preprocessing.Rmd, which processes the list of the 1019 included articles stored in 20201215_EGM_Net_all-articles_clean.csv and runs a query via a custom LENS API function to retrieve their detailed bibliographic information. The retrieved data is processed to remove duplicated records, resulting in 974 records with unique titles. The data is stored as a hierarchical data object with multiple levels of nested data tables (record_df in LENS_dataframe.RData).
This code produces collaboration plots and statistics using igraph package.
## Load the cleaned list of the 1019 included articles.
## (Alternative input: semi-cleaned LENS output, object "record_df".)
#load(file = here("data", "LENS_dataframe_cleaned.RData"))
articles_path <- here("data", "20201215_EGM_Net_all-articles_clean.csv")
dat <- read.csv(articles_path)
dim(dat) # expect 1019 rows x 16 columns
## [1] 1019 16
#names(dat)
#hist(dat$year)
## Initial sanity checks (kept for reference):
# length(unique(dat$Title)) # 1019 - every title is unique
# title <- unique(dat$Title)
# length(unique(dat$DOI)) # only 184 DOI values; missing DOIs stored as "" (consider recoding to NA)
# table(dat$Item_Type) # mostly articles
# hist(dat$Pub_Year, breaks = 70)
# table(dat$Manual_Tags) # character strings, tags separated by "; "
# #table(dat$Item_Type)["journalArticle"] # 959 journal articles
Country codes are mostly cleaned now. We use two-letter country codes (fips) from https://www.geonames.org/countries/.
## Author-country affiliation table (two-letter fips country codes).
record_df_data.authors.aff <- read.csv(here("data", "record_df_data.authors.aff_cleaned.csv"))
#names(record_df_data.authors.aff)
## Build "Last, Initials" author labels.
## NOTE(review): may need extra cleaning to exactly match the Author field in
## record_df_data.authors.ids.
record_df_data.authors.aff$Author <- paste0(record_df_data.authors.aff$last_name, ", ",
                                            record_df_data.authors.aff$initials)
## Edge-list input for igraph: one row per (publication, author, country code).
dtc <- data.frame(
  pub.id = record_df_data.authors.aff$data.lens_id,
  Author = record_df_data.authors.aff$Author,
  value  = record_df_data.authors.aff$country_code
)
#str(dtc)
## Country co-authorship graph: self-join on publication id, keep each author
## pair once (Author.x < Author.y), then count how many distinct author pairs
## link each pair of country codes. count() stores that tally in column "n",
## which graph_from_data_frame() attaches as an edge attribute.
country_pairs <- inner_join(dtc, dtc, by = "pub.id")
country_pairs <- filter(country_pairs, Author.x < Author.y)
country_edges <- count(country_pairs, value.x, value.y)
g1c <- graph_from_data_frame(country_edges, directed = FALSE)
#as_data_frame(g1c, what = "edges")
# plot(g1c)
# plot(g1c, edge.arrow.size=0, vertex.color="gold", vertex.size=5,
# vertex.frame.color="gray", vertex.label.color="black",
# vertex.label.cex=0.8, vertex.label.dist=2, edge.curved=0.2)
## FIX: count() stored the pair multiplicities in edge attribute "n", but
## simplify() below combined only "weight" (which did not exist) and ignored
## every other attribute, so the collaboration counts were silently dropped.
## Copy the counts into "weight" so parallel edges are summed when collapsed.
E(g1c)$weight <- E(g1c)$n
## Collapse parallel edges (summing weights) and drop self-loops.
g1cs <- simplify(g1c, remove.multiple = TRUE, remove.loops = TRUE,
                 edge.attr.comb = list(weight = "sum", "ignore"))
plot(g1cs, layout=layout_nicely, edge.arrow.size=0, vertex.color="gold", vertex.size=10, vertex.frame.color="gray",
     vertex.label.color="black", vertex.label.cex=0.8, vertex.label.dist=0, edge.curved=0.2)
#E(g1cs)
#V(g1cs)
## Work on the undirected version of the simplified country graph throughout.
g1cs_und <- as.undirected(g1cs)
## Find cliques (complete subgraphs of an undirected graph):
# cliques(g1cs_und) # list of cliques
# sapply(cliques(g1cs_und), length) # clique sizes
# largest_cliques(g1cs_und) # cliques with the maximum number of nodes
# vcol <- rep("grey80", vcount(g1cs_und))
# vcol[unlist(largest_cliques(g1cs_und))] <- "gold"
# plot(g1cs_und, vertex.label=V(g1cs)$name, vertex.color=vcol)
## Community detection based on edge betweenness (Newman-Girvan):
## high-betweenness edges are removed sequentially (recalculating at each step)
## and the best partitioning of the network is selected.
cebc <- cluster_edge_betweenness(g1cs_und)
#dendPlot(cebc, mode="hclust") #too dense
plot(cebc, g1cs_und, layout=layout_nicely, edge.arrow.size=0.0, vertex.color="gold",
     vertex.size=2, vertex.frame.color="grey", vertex.label.color="black") #, vertex.label=NA
#more at: https://kateto.net/netscix2016.html, pretty_plots.R
#more at: https://kateto.net/netscix2016.html, pretty_plots.R
Note: needs to be redone after cleaning and imputing institution data. Institution names (affiliation names) are identified via the Global Research Identifier Database (GRID): https://www.grid.ac/.
## Re-read the affiliation table and keep only rows with a GRID institution id.
record_df_data.authors.aff <- read.csv(here("data", "record_df_data.authors.aff_cleaned.csv"))
#dim(record_df_data.authors.aff) #2968
has_grid <- !is.na(record_df_data.authors.aff$grid_id)
record_df_data.inst <- record_df_data.authors.aff[has_grid, ]
#dim(record_df_data.inst) #2188 records
## Country code + institution grid.id combined serve as a unique identifier
## (the country prefix also makes the node labels readable).
record_df_data.inst$country_grid.id <- paste0(record_df_data.inst$country_code, ", ",
                                              record_df_data.inst$grid_id)
## "Last, Initials" author labels, as in the country section.
record_df_data.inst$Author <- paste0(record_df_data.inst$last_name, ", ",
                                     record_df_data.inst$initials)
## Edge-list input for igraph: one row per (publication, author, institution).
dti <- data.frame(
  pub.id = record_df_data.inst$data.lens_id,
  Author = record_df_data.inst$Author,
  value  = record_df_data.inst$country_grid.id
)
#str(dti)
## Institution co-authorship graph: self-join on publication id, deduplicate
## author pairs (Author.x < Author.y), then count author pairs per pair of
## institutions. count() stores the tally in column "n", which
## graph_from_data_frame() attaches as an edge attribute.
inst_pairs <- inner_join(dti, dti, by = "pub.id")
inst_pairs <- filter(inst_pairs, Author.x < Author.y)
inst_edges <- count(inst_pairs, value.x, value.y)
g1i <- graph_from_data_frame(inst_edges, directed = FALSE)
#as_data_frame(g1i, what = "edges")
# plot(g1i)
# plot(g1i, edge.arrow.size=0, vertex.color="gold", vertex.size=5,
# vertex.frame.color="gray", vertex.label.color="black",
# vertex.label.cex=0.8, vertex.label.dist=2, edge.curved=0.2)
## FIX: as in the country section, count() stored the pair multiplicities in
## edge attribute "n", but simplify() combined only "weight" (absent) and
## ignored everything else, so the counts were silently dropped. Copy them
## into "weight" so parallel edges are summed when collapsed.
E(g1i)$weight <- E(g1i)$n
## Collapse parallel edges (summing weights) and drop self-loops.
g1is <- simplify(g1i, remove.multiple = TRUE, remove.loops = TRUE,
                 edge.attr.comb = list(weight = "sum", "ignore"))
plot(g1is, layout=layout_nicely, edge.arrow.size=0, vertex.color="gold", vertex.size=5, vertex.frame.color="gray",
     vertex.label.color="black", vertex.label.cex=0.8, vertex.label.dist=2, edge.curved=0.2)
#E(g1is)
#V(g1is)
##Find cliques (complete subgraphs of an undirected graph)
# cliques(as.undirected(g1is)) # list of cliques (can be very long)
# sapply(cliques(as.undirected(g1is)), length) # clique sizes
## Auto-prints the largest fully-connected institution groups (output below).
largest_cliques(as.undirected(g1is)) # clique with max number of nodes
## [[1]]
## + 5/299 vertices, named, from 190d589:
## [1] ES, grid.5319.e CH, grid.5333.6 GB, grid.8250.f NL, grid.5477.1
## [5] RO, grid.5100.4
##
## [[2]]
## + 5/299 vertices, named, from 190d589:
## [1] US, grid.463419.d US, grid.34421.30 US, grid.472551.0 ES, grid.4711.3
## [5] US, grid.167436.1
##
## [[3]]
## + 5/299 vertices, named, from 190d589:
## [1] UK, grid.423196.b GB, grid.421630.2 GB, grid.1006.7 GB, grid.11835.3e
## [5] GB, grid.4991.5
##
## [[4]]
## + 5/299 vertices, named, from 190d589:
## [1] SK, grid.412903.d DE, grid.7450.6 DE, grid.8379.5 DE, grid.9122.8
## [5] HU, grid.5591.8
##
## [[5]]
## + 5/299 vertices, named, from 190d589:
## [1] SE, grid.6341.0 GB, grid.9435.b DE, grid.8379.5 NL, grid.4818.5
## [5] SE, grid.4514.4
##
## [[6]]
## + 5/299 vertices, named, from 190d589:
## [1] RO, grid.5100.4 GB, grid.8250.f NL, grid.5477.1 ES, grid.5841.8
## [5] PL, grid.413454.3
##
## [[7]]
## + 5/299 vertices, named, from 190d589:
## [1] GB, grid.8391.3 GB, grid.418374.d GB, grid.43641.34 GB, grid.9835.7
## [5] GB, grid.5491.9
##
## [[8]]
## + 5/299 vertices, named, from 190d589:
## [1] GB, grid.8391.3 GB, grid.418374.d GB, grid.43641.34 GB, grid.9835.7
## [5] GB, grid.421944.e
## Undirected version of the simplified institution graph, used throughout.
g1is_und <- as.undirected(g1is)
# vcol <- rep("grey80", vcount(g1is_und))
# vcol[unlist(largest_cliques(g1is_und))] <- "gold"
# plot(g1is_und, vertex.label=V(g1is)$name, vertex.color=vcol)
## Community detection based on edge betweenness (Newman-Girvan):
## high-betweenness edges are removed sequentially (recalculating at each step)
## and the best partitioning of the network is selected.
cebi <- cluster_edge_betweenness(g1is_und)
#dendPlot(cebi, mode="hclust") #too dense
plot(cebi, g1is_und, layout=layout_nicely, edge.arrow.size=0.0, vertex.color="gold",
     vertex.size=2, vertex.frame.color="grey", vertex.label.color="black", vertex.label=NA)
#more at: https://kateto.net/netscix2016.html, pretty_plots.R